# Generate statistics for essays on SAT EBRW for Tables 2, 3, and S2

library(caret)
library(tidyverse)

# Topic-feature model, merged essays: regress RSAT_EBRW on the log-transformed
# topic columns (6-75 of merged_final.csv) using 10-fold cross-validation.
merged_df <- read.csv("merged_final.csv")

merged_folds <- trainControl(number = 10, method = "cv")

# Log-transform the 70 topic columns once; the same matrix feeds cor() below.
merged_topics <- as.matrix(log(merged_df[, 6:75]))

# Absolute correlation of every logged topic with the outcome, kept for
# inspection when choosing which topic to leave out.
min_merged_cor <- abs(cor(merged_df$RSAT_EBRW, merged_topics))

# Drop min cor, topic 16
merged_topics <- merged_topics[, -16]

# From here on merged_df holds only the outcome (auto-named V1, because the
# cbind() argument carries no name) and the remaining logged topics; the raw
# CSV columns are no longer reachable through it.
merged_df <- cbind(merged_df$RSAT_EBRW, merged_topics) %>%
  as.data.frame()

# Seed right before train(), the only RNG consumer in this section, so the
# fold assignment is reproducible.
set.seed(1993)
merged_mod <- train(V1 ~ ., data = merged_df, method = "lm",
                    trControl = merged_folds)

# Cross-validated performance, then the coefficient table of the final fit.
print(merged_mod)
summary(merged_mod)

################################################################################

# Topic-feature model, creative essays: regress RSAT_EBRW on the
# log-transformed topic columns (6-55 of creative_final.csv) using 10-fold
# cross-validation.
creativ_df <- read.csv("creative_final.csv")

creativ_folds <- trainControl(number = 10, method = "cv")

# Log-transform the 50 topic columns once; the same matrix feeds cor() below.
creativ_topics <- as.matrix(log(creativ_df[, 6:55]))

# Absolute correlation of every logged topic with the outcome, kept for
# inspection when choosing which topic to leave out.
min_creativ_cor <- abs(cor(creativ_df$RSAT_EBRW, creativ_topics))

# Drop min cor, topic 8
creativ_topics <- creativ_topics[, -8]

# From here on creativ_df holds only the outcome (auto-named V1, because the
# cbind() argument carries no name) and the remaining logged topics; the raw
# CSV columns are no longer reachable through it.
creativ_df <- cbind(creativ_df$RSAT_EBRW, creativ_topics) %>%
  as.data.frame()

# Seed right before train(), the only RNG consumer in this section, so the
# fold assignment is reproducible.
set.seed(1993)
creativ_mod <- train(V1 ~ ., data = creativ_df, method = "lm",
                     trControl = creativ_folds)

# Cross-validated performance, then the coefficient table of the final fit.
print(creativ_mod)
summary(creativ_mod)

################################################################################

# Topic-feature model, significant essays: regress RSAT_EBRW on the
# log-transformed topic columns (6-55 of signif_final.csv) using 10-fold
# cross-validation.
signif_df <- read.csv("signif_final.csv")

signif_folds <- trainControl(number = 10, method = "cv")

# Log-transform the 50 topic columns once; the same matrix feeds cor() below.
signif_topics <- as.matrix(log(signif_df[, 6:55]))

# Absolute correlation of every logged topic with the outcome, kept for
# inspection when choosing which topic to leave out.
min_signif_cor <- abs(cor(signif_df$RSAT_EBRW, signif_topics))

# Drop min cor, topic 24
signif_topics <- signif_topics[, -24]

# From here on signif_df holds only the outcome (auto-named V1, because the
# cbind() argument carries no name) and the remaining logged topics; the raw
# CSV columns are no longer reachable through it.
signif_df <- cbind(signif_df$RSAT_EBRW, signif_topics) %>%
  as.data.frame()

# Seed right before train(), the only RNG consumer in this section, so the
# fold assignment is reproducible.
set.seed(1993)
signif_mod <- train(V1 ~ ., data = signif_df, method = "lm",
                    trControl = signif_folds)

# Cross-validated performance, then the coefficient table of the final fit.
print(signif_mod)
summary(signif_mod)

################################################################################
# 10-fold CV using dictionary features (the *_liwc column blocks) as inputs

# Dictionary-feature model, merged essays: 10-fold cross-validated linear
# regression of RSAT_EBRW on columns 76-167 of merged_final.csv.
#
# The topic-model section earlier in this script replaces merged_df with a
# frame that holds only V1 and the logged topics, so neither columns 76:167
# nor RSAT_EBRW can be taken from it any more. Reload the raw file instead.
merged_raw <- read.csv("merged_final.csv")

# Fail early with a clear message if the file layout is not what the
# hard-coded column positions below expect.
stopifnot("RSAT_EBRW" %in% names(merged_raw), ncol(merged_raw) >= 167)

merged_folds <- trainControl(method = "cv", number = 10)
merged_liwc <- as.matrix(merged_raw[, 76:167])

# The outcome is passed to cbind() without a name, so as.data.frame() labels
# it V1; the model formula below relies on that name.
merged_df <- as.data.frame(cbind(merged_raw$RSAT_EBRW, merged_liwc))

# Seed right before train() so the fold assignment is reproducible.
set.seed(1993)
merged_mod <- train(V1 ~ ., method = "lm",
                    data = merged_df, trControl = merged_folds)

print(merged_mod)

################################################################################

# Dictionary-feature model, creative essays: 10-fold cross-validated linear
# regression of RSAT_EBRW on columns 56-147 of creative_final.csv.
#
# The topic-model section earlier in this script replaces creativ_df with a
# frame that holds only V1 and the logged topics, so neither columns 56:147
# nor RSAT_EBRW can be taken from it any more. Reload the raw file instead.
creativ_raw <- read.csv("creative_final.csv")

# Fail early with a clear message if the file layout is not what the
# hard-coded column positions below expect.
stopifnot("RSAT_EBRW" %in% names(creativ_raw), ncol(creativ_raw) >= 147)

creativ_folds <- trainControl(method = "cv", number = 10)
creativ_liwc <- as.matrix(creativ_raw[, 56:147])

# The outcome is passed to cbind() without a name, so as.data.frame() labels
# it V1; the model formula below relies on that name.
creativ_df <- as.data.frame(cbind(creativ_raw$RSAT_EBRW, creativ_liwc))

# Seed right before train() so the fold assignment is reproducible.
set.seed(1993)
creativ_mod <- train(V1 ~ ., method = "lm",
                     data = creativ_df, trControl = creativ_folds)

print(creativ_mod)

################################################################################

# Dictionary-feature model, significant essays: 10-fold cross-validated linear
# regression of RSAT_EBRW on columns 56-147 of signif_final.csv.
#
# The topic-model section earlier in this script replaces signif_df with a
# frame that holds only V1 and the logged topics, so neither columns 56:147
# nor RSAT_EBRW can be taken from it any more. Reload the raw file instead.
signif_raw <- read.csv("signif_final.csv")

# Fail early with a clear message if the file layout is not what the
# hard-coded column positions below expect.
stopifnot("RSAT_EBRW" %in% names(signif_raw), ncol(signif_raw) >= 147)

signif_folds <- trainControl(method = "cv", number = 10)
signif_liwc <- as.matrix(signif_raw[, 56:147])

# The outcome is passed to cbind() without a name, so as.data.frame() labels
# it V1; the model formula below relies on that name.
signif_df <- as.data.frame(cbind(signif_raw$RSAT_EBRW, signif_liwc))

# Seed right before train() so the fold assignment is reproducible.
set.seed(1993)
signif_mod <- train(V1 ~ ., method = "lm",
                    data = signif_df, trControl = signif_folds)

print(signif_mod)

